#!/bin/bash
#
# For TwoStageCollective
# This script generates GNUPlot data/script files for plotting the error of
# the 2. classifier on the labeled data (x-axis) against the error of the
# 1. classifier on the unlabeled data (y-axis)
#
# FracPete


# the usage of this script
#
# Prints the help screen to stdout. Every option that stores a value or a
# yes/no switch in a global variable is followed by a "current:" line showing
# that variable, so the output reflects the options parsed so far.
function usage()
{
   echo 
   echo "usage: ${0##*/} -i <dir> -o <dir> [-l <length>] [-e] [-h]"
   echo "       [-H <host>] [-P <port>] [-u <user>] [-p <password>]"
   echo "       [-D <database>] [-r] [-s] [-t <title>] [-T <type>]"
   echo
   echo "Generates GNUPlots from RMS files generated by TwoStageCollective"
   echo 
   echo " -h   this help"
   echo " -i   <dir>"
   echo "      the directory containing the CSV files"
   echo "      current: $INPUT"
   echo " -o   <dir>"
   echo "      the output directory for the plot files etc."
   echo "      current: $OUTPUT"
   echo " -l   <length>"
   echo "      determines the length of the MD5 of the options to print "
   echo "      current: $LENGTH"
   echo " -e   execute the gnuplot scripts"
   echo "      current: $EXECUTE"
   echo " -H   <host>"
   echo "      the DB host to connect to"
   echo "      current: $DBHOST"
   echo " -P   <port>"
   echo "      the port MySQL is listening on"
   echo "      current: $DBPORT"
   echo " -u   <user>"
   echo "      the DB user to connect as"
   echo "      current: $DBUSER"
   echo " -p   <password>"
   echo "      the password to use for authentication"
   echo "      current: $DBPW"
   echo " -D   <database>"
   echo "      the database to connect to"
   echo "      current: $DATABASE"
   echo " -r   use human-readable captions, not MD5 (needs DB access)"
   echo "      current: $READABLE"
   echo " -s   generates really short human-readable captions, in"
   echo "      combination with '-r'"
   echo "      current: $SHORT"
   echo " -t   <title>"
   echo "      the title prefix for the axes"
   echo "      current: $TITLE"
   echo " -T   <type>"
   echo "      the type of data to print (1=RMS, 2=Perc. Misclassifications)"
   echo "      current: $TYPE"
   echo 
}

# transposes the file IN into OUT
#
# Globals read: IN  - path of the comma-separated input file
#               OUT - path of the file to write
# The commas are turned into blanks and awk then swaps rows and columns, so
# line i of OUT holds the i-th field of every input line, blank-separated.
# Any arguments given to the function are handed to awk unchanged.
function transpose()
{
   sed 's/,/ /g' < "$IN" | awk '
      # first line: start one output row per field
      NR == 1 {
         n = NF
         for (i = 1; i <= NF; i++)
            row[i] = $i
         next
      }
      # following lines: append each field to its output row
      {
         if (NF > n)
            n = NF
         for (i = 1; i <= NF; i++)
            row[i] = row[i] " " $i
      }
      END {
         for (i = 1; i <= n; i++)
            print row[i]
      }' "$@" > "$OUT"
}

# executes a query SQL and stores output in file OUT
#
# Globals read: DBHOST, DBPORT, DBUSER, DBPW, DATABASE - connection settings
#               SQL - the statement to execute
#               OUT - path of the file receiving mysql's stdout
# Returns the exit status of the mysql client.
# NOTE(review): the password is passed on the command line and is therefore
# visible in the process list - consider an option file if that matters.
function exec_sql()
{
   # every value is quoted so that blanks (e.g. in a password) stay intact
   mysql --host="$DBHOST" --port="$DBPORT" --user="$DBUSER" --password="$DBPW" --database="$DATABASE" --execute="$SQL" > "$OUT"
}

# variables (default values of the script's options)

# directories
INPUT='.'
OUTPUT='.'

# yes/no switches
EXECUTE='no'
READABLE='no'
SHORT='no'
TITLE_SET='no'

# captions and data type
LENGTH='16'
TITLE='RMS'
TYPE='1'

# database connection
DBHOST='localhost'
DBPORT='3306'
DBUSER='nobody'
DBPW=''
DATABASE=''

# interpret parameters
#
# Parses the given command-line options and stores their values in the
# global option variables (INPUT, OUTPUT, LENGTH, DBHOST, ...).
# -h prints the usage and exits with 0; an unknown option or a missing
# option argument prints an error plus the usage and exits with 1.
function parse_args()
{
   # OPTIND is made local so that the function can be called more than once
   local OPTIND=1 OPTARG flag

   # "l:" was missing here although -l <length> is a documented option
   while getopts ":hi:o:l:H:P:u:p:D:t:T:ers" flag
   do
      case "$flag" in
         i) INPUT=$OPTARG
            ;;
         o) OUTPUT=$OPTARG
            ;;
         l) LENGTH=$OPTARG
            ;;
         H) DBHOST=$OPTARG
            ;;
         P) DBPORT=$OPTARG
            ;;
         u) DBUSER=$OPTARG
            ;;
         p) DBPW=$OPTARG
            ;;
         D) DATABASE=$OPTARG
            ;;
         T) TYPE=$OPTARG
            ;;
         t) TITLE=$OPTARG
            TITLE_SET="yes"
            ;;
         e) EXECUTE="yes"
            ;;
         r) READABLE="yes"
            ;;
         s) SHORT="yes"
            ;;
         h) usage
            exit 0
            ;;
         # the leading ':' of the option string makes getopts report a
         # missing argument as ':' with the option letter in OPTARG
         :) echo
            echo "ERROR: option '-$OPTARG' requires an argument!"
            echo
            usage
            exit 1
            ;;
         *) echo
            echo "ERROR: unknown option '-$OPTARG'!"
            echo
            usage
            exit 1
            ;;
      esac
   done
}

parse_args "$@"

# everything provided?
# input and output directory are always required
if [[ -z "$INPUT" || -z "$OUTPUT" ]]
then
   echo
   echo "ERROR: not all parameters provided!"
   echo
   usage
   exit 2
fi

# readable captions additionally need the DB host and the database
if [[ "$READABLE" == "yes" && ( -z "$DBHOST" || -z "$DATABASE" ) ]]
then
   echo
   echo "ERROR: not all parameters provided!"
   echo
   usage
   exit 2
fi

# only the types 1 and 2 are known
case "$TYPE" in
   1|2)
      ;;
   *)
      echo
      echo "ERROR: unknown type '-T $TYPE'!"
      echo
      usage
      exit 3
      ;;
esac

# some dependent variables

# axis title: derived from the type unless one was given with -t
if [[ "$TITLE_SET" == "no" ]]
then
   case "$TYPE" in
      1) TITLE="RMS" ;;
      2) TITLE="% Miscl." ;;
   esac
fi

# the type also selects the suffix used in the names of the CSV files
case "$TYPE" in
   1) SUFFIX="rms" ;;
   2) SUFFIX="miscls" ;;
esac
TMPNAME1="${SUFFIX}_train"
TMPNAME2="${SUFFIX}_test-original_1"

# process
echo
echo "Processing '$INPUT'"

# determine datasets
# The dataset name is cut out of the name of every "*-$TMPNAME1.csv" file
# below $INPUT: everything up to and including "weka.<classifier>-" is
# removed, then everything from the first "-R" on, then the leading hex
# string with its dash. DATASETS ends up as a sorted, duplicate-free list
# with one name per line.
echo "Determining datasets..."
DATASETS=$(find "$INPUT" -name "*-$TMPNAME1.csv" \
   | sed -e 's/^.*weka\.[a-zA-Z0-9\.]*-//' \
         -e 's/-R.*$//' \
         -e 's/^[a-f0-9]*-//' \
   | sort -u)

# generate short classifier name, instead of MD5
#
# Reads all distinct (summary, key_scheme_options) pairs via exec_sql and
# writes two files into $OUTPUT:
#   options.txt - tab-separated: summary, key_scheme_options, caption
#   options.key - sorted lines of the form "caption = key_scheme_options"
# The caption is "<name>_<n>": <name> is what remains of the options after
# the last "weka.classifiers." up to the next blank, reduced to the part
# behind its last dot; <n> is the running number of the row. With
# SHORT="yes" all lowercase letters are removed from the name.
# Globals read:    OUTPUT, SHORT
# Globals written: OPTIONS, SQL, OUT (the latter two are input of exec_sql)
function generate_options_key()
{
   OPTIONS="$OUTPUT/options.tmp"

   # retrieve all distinct options
   SQL="select distinct summary,key_scheme_options from Results0 where summary not like '% %' order by key_scheme_options"
   OUT="${OPTIONS}_"
   exec_sql
   # drop the header line of the query output
   grep -v "key_scheme_options" "${OPTIONS}_" > "$OPTIONS"

   # get last classifier of meta classifier, numbered as "<n>:<name>"
   cut -f2 "$OPTIONS" \
      | sed -e 's/.*weka\.classifiers\.//g' -e 's/ .*//g' -e 's/.*\.//g' \
      | grep -n "." > "${OPTIONS}2"

   # make names really short?
   if [ "$SHORT" = "yes" ]
   then
      sed 's/[a-z]*//g' "${OPTIONS}2" > "${OPTIONS}2_"
      mv -f "${OPTIONS}2_" "${OPTIONS}2"
   fi

   # generate numbered classifiers ("<n>:<name>" becomes "<name>_<n>")
   cut -f1 -d":" "${OPTIONS}2" > "${OPTIONS}3"
   cut -f2 -d":" "${OPTIONS}2" > "${OPTIONS}4"
   paste -d "_" "${OPTIONS}4" "${OPTIONS}3" > "${OPTIONS}5"
   paste "$OPTIONS" "${OPTIONS}5" > "$OUTPUT/options.txt"

   # generate key
   cut -f2 "$OUTPUT/options.txt" > "${OPTIONS}6"
   cut -f3 "$OUTPUT/options.txt" > "${OPTIONS}7"
   # only the first "=" is the separator added by paste; any "=" inside the
   # options themselves must stay untouched
   paste -d "=" "${OPTIONS}7" "${OPTIONS}6" | sed 's/=/ = /' | sort > "$OUTPUT/options.key"

   # clean up
   rm -f -- "$OPTIONS"*
}

if [ "$READABLE" = "yes" ]
then
   # info
   echo "Generating options key..."

   generate_options_key
fi

# Generates the gnuplot data file and script of a single dataset.
#
# Arguments:    $1 - name of the dataset
# Globals read: INPUT, OUTPUT, TMPNAME1, TMPNAME2, READABLE, LENGTH, TITLE
# Globals set:  IN, OUT (input of transpose)
# Output files: $OUTPUT/<dataset>.dat     - per classifier an x and a y
#                                           column, tab-separated
#               $OUTPUT/<dataset>.gnuplot - plots the column pairs into
#                                           <dataset>.png
# Prints one "- <dataset>/<classifier>" line per processed classifier.
function process_dataset()
{
   local dataset=$1
   local datafile="$OUTPUT/$dataset.dat"
   local gnuplot="$OUTPUT/$dataset.gnuplot"
   local csv found name md5 classifier marker plot
   local xt yt xcol ycol columns col col1 col2

   # classifier count
   local cnt=0

   # delete output files
   rm -f -- "$gnuplot" "$datafile"

   for csv in "$INPUT"/weka.*-"$dataset"-*-"$TMPNAME1".csv
   do
      # an unmatched glob is left as the literal pattern
      [ -e "$csv" ] || continue

      # is name ambiguous, i.e., substring of one another? The glob also
      # matches datasets whose name merely starts with "$dataset-", so the
      # dataset is extracted from the file name again and compared as a
      # string (not just by length).
      found=$(printf '%s\n' "$csv" \
         | sed -e 's/^.*weka\.[a-zA-Z0-9\.]*-//' -e 's/-R.*$//' -e 's/^[a-f0-9]*-//')
      if [ "$found" != "$dataset" ]
      then
         continue
      fi

      cnt=$((cnt + 1))

      # determine prefix: file name without directory and "-$TMPNAME1.csv"
      name=${csv##*/}
      name=${name%"-$TMPNAME1.csv"}
      # the MD5 is the first dash-delimited field behind the "weka..." part
      md5=$(printf '%s\n' "$csv" | sed -e 's/.*weka[^-]*-//' -e 's/-.*//')
      if [ "$READABLE" = "no" ]
      then
         classifier=$(printf '%s\n' "$md5" | cut -b1-"$LENGTH")
      else
         classifier=$(grep -F -- "$md5" "$OUTPUT/options.txt" | cut -f3)
      fi
      echo "- $dataset/$classifier"

      # temporary files of this classifier
      xt="$OUTPUT/$name-$TMPNAME1.t"
      yt="$OUTPUT/$name-$TMPNAME2.t"
      xcol="$OUTPUT/$name-$TMPNAME1.onecol"
      ycol="$OUTPUT/$name-$TMPNAME2.onecol"

      # transpose files
      IN="$INPUT/$name-$TMPNAME1.csv"; OUT="$xt"; transpose
      IN="$INPUT/$name-$TMPNAME2.csv"; OUT="$yt"; transpose

      # empty files; the header of the very first column starts with "#",
      # which makes the whole pasted header line start with "#"
      if [ "$cnt" -eq 1 ]
      then
         marker="#"
      else
         marker=""
      fi
      echo "$marker x $classifier $cnt" > "$xcol"
      echo " y $classifier $cnt" > "$ycol"

      # determine columns
      columns=$(head -n 1 "$xt" | wc -w | tr -d ' ')

      # paste different runs in one column
      for ((col = 1; col <= columns; col++))
      do
         # x-axis
         cut -f"$col" -d" " "$xt" >> "$xcol"

         # y-axis
         cut -f"$col" -d" " "$yt" >> "$ycol"
      done

      # data file: append the two new columns to the existing ones
      if [ "$cnt" -eq 1 ]
      then
         paste "$xcol" "$ycol" > "${datafile}_"
      else
         paste "$datafile" "$xcol" "$ycol" > "${datafile}_"
      fi
      mv -f "${datafile}_" "$datafile"

      # clean up (exactly the files created above)
      rm -f -- "$xt" "$yt" "$xcol" "$ycol"

      # gnuplot file
      if [ "$cnt" -eq 1 ]
      then
         plot="plot"
      else
         plot="replot"
      fi
      col1=$((cnt * 2 - 1))
      col2=$((col1 + 1))
      echo "$plot '$OUTPUT/$dataset.dat' using $col1:$col2 title '$classifier'" >> "$gnuplot"
   done

   # gnuplot file
   {
      echo "set terminal png size 800 600"
      echo "set output '$dataset.png'"
      echo "set xlabel '$TITLE of 2. classifier on labeled data'"
      echo "set ylabel '$TITLE of 1. classifier on unlabeled data'"
      echo "replot"
   } >> "$gnuplot"
}

echo "Processing data..."
# DATASETS is a whitespace-separated list, hence deliberately unquoted
for DATASET in $DATASETS 
do
   process_dataset "$DATASET"
done

# execute the scripts?
# Runs gnuplot on every *.gnuplot file in $OUTPUT and prints one dot per
# script as a progress indicator.
if [ "$EXECUTE" = "yes" ]
then
   echo
   echo -n "Generating plots"
   for SCRIPT in "$OUTPUT"/*.gnuplot
   do
      # without any script the glob is left as the literal pattern; skip it
      [ -e "$SCRIPT" ] || continue
      echo -n "."
      gnuplot "$SCRIPT"
   done
   echo
fi


